{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "import scipy.io\n",
    "from PIL import Image, ImageDraw\n",
    "import xml.etree.ElementTree as ET\n",
    "import pandas as pd\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Devkit folder: 'data/meta.mat' and the validation ground truth file are read from it.\n",
    "DEVKIT_PATH = '/home/gpu2/hdd/dan/imagenet/ILSVRC2012_devkit_t12/'\n",
    "# Training images: this tree is walked recursively and the parent folder\n",
    "# of every file is taken as its wordnet id.\n",
    "TRAIN_IMAGES_PATH = '/home/gpu2/hdd/dan/imagenet/train/'\n",
    "# Bounding box annotations: searched with the pattern '*/*.xml'.\n",
    "TRAIN_ANNOTATIONS_PATH = '/home/gpu2/hdd/dan/imagenet/annotations/'\n",
    "# Validation images: every entry of this single folder is listed.\n",
    "VAL_IMAGES_PATH = '/home/gpu2/hdd/dan/imagenet/val/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load label meanings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the devkit metadata and keep the first column of its 'synsets' table.\n",
    "meta_path = os.path.join(DEVKIT_PATH, 'data/meta.mat')\n",
    "meta = scipy.io.loadmat(meta_path)['synsets'][:, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn every synset record into a (label, wordnet id, name, image count) row.\n",
    "rows = []\n",
    "for synset in meta:\n",
    "    ilsvrc_id, wnid, words, _, _, _, _, train_count = synset\n",
    "\n",
    "    # every field is array-wrapped, so pull out the single value it holds\n",
    "    row = (ilsvrc_id[0, 0], wnid[0], words[0], train_count[0, 0])\n",
    "\n",
    "    # records with a label above 1000 are dropped\n",
    "    if row[0] <= 1000:\n",
    "        rows.append(row)\n",
    "\n",
    "label_metadata = pd.DataFrame(\n",
    "    rows, columns=['integer_label', 'wordnet_id', 'human_readable', 'num_train_images']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# wordnet_decoder: wordnet id -> human readable name\n",
    "# id_decoder: integer label -> wordnet id\n",
    "wordnet_decoder = {}\n",
    "id_decoder = {}\n",
    "for row in label_metadata.itertuples():\n",
    "    wordnet_decoder[row.wordnet_id] = row.human_readable\n",
    "    id_decoder[row.integer_label] = row.wordnet_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# total of the per-class image counts stored in the metadata\n",
    "total_train_images = label_metadata['num_train_images'].sum()\n",
    "print('number of training examples:', total_train_images)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parse bounding boxes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_xml_annotation(xml_file):\n",
    "    \"\"\"Parse a single XML file containing bounding boxes.\n",
    "\n",
    "    Arguments:\n",
    "        xml_file: a string, path to the XML annotation file.\n",
    "    Returns:\n",
    "        boxes: a list of tuples (xmin, ymin, xmax, ymax, class_label),\n",
    "            one per `object` element. Coordinates are divided by the\n",
    "            width/height given in the `size` element, ordered so that\n",
    "            min <= max, and clipped to the [0, 1] range.\n",
    "        filename: a string, the text of the `filename` element.\n",
    "    \"\"\"\n",
    "\n",
    "    root = ET.parse(xml_file).getroot()\n",
    "    size = root.find('size')\n",
    "    width = float(size.find('width').text)\n",
    "    height = float(size.find('height').text)\n",
    "    filename = root.find('filename').text\n",
    "\n",
    "    def clip(value):\n",
    "        # keep a normalized coordinate inside the image\n",
    "        return min(max(value, 0.0), 1.0)\n",
    "\n",
    "    boxes = []\n",
    "    for child in root:\n",
    "        if child.tag != 'object':\n",
    "            continue\n",
    "\n",
    "        bbox = child.find('bndbox')\n",
    "        class_label = child.find('name').text\n",
    "\n",
    "        x0 = float(bbox.find('xmin').text) / width\n",
    "        x1 = float(bbox.find('xmax').text) / width\n",
    "        y0 = float(bbox.find('ymin').text) / height\n",
    "        y1 = float(bbox.find('ymax').text) / height\n",
    "\n",
    "        # Order both ends in one step. Doing it in two statements\n",
    "        # (xmin = min(xmin, xmax); xmax = max(xmin, xmax)) overwrites the\n",
    "        # larger value first, so a box with swapped corners would collapse\n",
    "        # to zero size instead of being reordered.\n",
    "        xmin, xmax = min(x0, x1), max(x0, x1)\n",
    "        ymin, ymax = min(y0, y1), max(y0, y1)\n",
    "\n",
    "        boxes.append((clip(xmin), clip(ymin), clip(xmax), clip(ymax), class_label))\n",
    "\n",
    "    return boxes, filename"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# annotation files sit one folder below the annotations root\n",
    "annotation_pattern = os.path.join(TRAIN_ANNOTATIONS_PATH, '*/*.xml')\n",
    "xml_files = glob.glob(annotation_pattern)\n",
    "print('number of annotations:', len(xml_files))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "all_wordnet_ids = list(wordnet_decoder.keys())\n",
    "\n",
    "# counters reported in the next cell\n",
    "skipped_boxes = 0\n",
    "skipped_files = 0\n",
    "saved_boxes = 0\n",
    "saved_files = 0\n",
    "\n",
    "annotations = []  # rows of (image name, xmin, ymin, xmax, ymax)\n",
    "weird_files = []  # paths of annotation files where something looked off\n",
    "for path in tqdm(xml_files):\n",
    "\n",
    "    # the folder holding an annotation must be a known wordnet id;\n",
    "    # the dict is checked instead of `all_wordnet_ids` (same keys) so that\n",
    "    # the lookup does not scan a list once per annotation file\n",
    "    folder = os.path.basename(os.path.dirname(path))\n",
    "    assert folder in wordnet_decoder\n",
    "\n",
    "    boxes, filename = process_xml_annotation(path)\n",
    "    assert len(boxes) > 0\n",
    "\n",
    "    # the name recorded inside the XML should match the XML file name\n",
    "    image_filename = os.path.splitext(os.path.basename(path))[0]\n",
    "    if filename != image_filename:\n",
    "        weird_files.append(path)\n",
    "\n",
    "    found_box = False\n",
    "    for box in boxes:\n",
    "        xmin, ymin, xmax, ymax, label = box\n",
    "\n",
    "        # skip boxes labeled with a class other than the folder's class\n",
    "        if label != folder:\n",
    "            skipped_boxes += 1\n",
    "            weird_files.append(path)\n",
    "            continue\n",
    "\n",
    "        # skip boxes with no area\n",
    "        if (xmin >= xmax) or (ymin >= ymax):\n",
    "            skipped_boxes += 1\n",
    "            weird_files.append(path)\n",
    "            continue\n",
    "\n",
    "        annotations.append((image_filename, xmin, ymin, xmax, ymax))\n",
    "\n",
    "        saved_boxes += 1\n",
    "        found_box = True\n",
    "\n",
    "    # a file counts as saved when at least one of its boxes was kept\n",
    "    if found_box:\n",
    "        saved_files += 1\n",
    "    else:\n",
    "        skipped_files += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# order: files with at least one kept box, kept boxes, files with no kept box, skipped boxes\n",
    "print(saved_files, saved_boxes, skipped_files, skipped_boxes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one row per kept bounding box\n",
    "box_columns = ['just_name', 'xmin', 'ymin', 'xmax', 'ymax']\n",
    "annotations = pd.DataFrame(annotations, columns=box_columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collect paths to training examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect all paths to images\n",
    "\n",
    "training = []\n",
    "for path, subdirs, files in tqdm(os.walk(TRAIN_IMAGES_PATH)):\n",
    "    for name in files:\n",
    "        training.append(os.path.join(path, name))\n",
    "\n",
    "training = pd.DataFrame(training, columns=['path'])\n",
    "\n",
    "# the class of an image is the name of the folder that contains it\n",
    "training['wordnet_id'] = training.path.apply(lambda x: os.path.basename(os.path.dirname(x)))\n",
    "\n",
    "# File name without its extension. The extension is split off rather than\n",
    "# cutting a fixed five characters, which is only right for '.JPEG';\n",
    "# this is also how the annotation names are derived from the XML paths.\n",
    "training['just_name'] = training.path.apply(lambda x: os.path.splitext(os.path.basename(x))[0])\n",
    "\n",
    "# unique file extensions\n",
    "training.path.apply(lambda x: x.split('.')[-1]).unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# every image folder is a known class\n",
    "assert training['wordnet_id'].isin(all_wordnet_ids).all()\n",
    "# every annotated name belongs to some training image\n",
    "assert annotations['just_name'].isin(training['just_name']).all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of image files found on disk\n",
    "print('number of training examples:', training.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Show some training images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# display a randomly chosen training image together with its class name\n",
    "i = np.random.randint(0, len(training))\n",
    "example = training.loc[i]\n",
    "print(wordnet_decoder[example.wordnet_id])\n",
    "Image.open(example.path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def draw_boxes(image, boxes):\n",
    "    \"\"\"Return a copy of the image with the boxes painted over it.\n",
    "\n",
    "    Arguments:\n",
    "        image: an image exposing `size` and `copy()`; it is not modified.\n",
    "        boxes: rows of (xmin, ymin, xmax, ymax); they are multiplied by\n",
    "            (width, height, width, height) before drawing.\n",
    "    Returns:\n",
    "        the painted copy.\n",
    "    \"\"\"\n",
    "    img_w, img_h = image.size\n",
    "    pixel_boxes = boxes*np.array([img_w, img_h, img_w, img_h])\n",
    "\n",
    "    canvas = image.copy()\n",
    "    painter = ImageDraw.Draw(canvas, 'RGBA')\n",
    "\n",
    "    for left, top, right, bottom in pixel_boxes:\n",
    "        # translucent red fill with a black border\n",
    "        painter.rectangle(\n",
    "            [(left, top), (right, bottom)],\n",
    "            fill=(255, 0, 0, 75), outline='black'\n",
    "        )\n",
    "\n",
    "    return canvas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# pick a random box, then draw every box of the image it belongs to\n",
    "i = np.random.randint(0, len(annotations))\n",
    "just_name = annotations.loc[i, 'just_name']\n",
    "wordnet_id = just_name.partition('_')[0]\n",
    "\n",
    "image_path = os.path.join(TRAIN_IMAGES_PATH, wordnet_id, just_name + '.JPEG')\n",
    "image = Image.open(image_path)\n",
    "same_image = annotations['just_name'] == just_name\n",
    "boxes = annotations.loc[same_image, ['xmin', 'ymin', 'xmax', 'ymax']].values\n",
    "\n",
    "print(wordnet_decoder[wordnet_id])\n",
    "draw_boxes(image, boxes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get labels for validation images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get labels for validation images\n",
    "\n",
    "ground_truth_path = os.path.join(DEVKIT_PATH, 'data/ILSVRC2012_validation_ground_truth.txt')\n",
    "with open(ground_truth_path) as f:\n",
    "    integer_labels = [int(line.strip()) for line in f.readlines()]\n",
    "\n",
    "# one wordnet id per line of the ground truth file\n",
    "content = [id_decoder[integer_label] for integer_label in integer_labels]\n",
    "len(content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pair every validation image with the number embedded in its file name\n",
    "numbered_paths = []\n",
    "for name in os.listdir(VAL_IMAGES_PATH):\n",
    "    digits = name.split('_')[-1].split('.')[0]\n",
    "    numbered_paths.append((os.path.join(VAL_IMAGES_PATH, name), int(digits.lstrip())))\n",
    "\n",
    "# order by that number, because labels are attached to the rows by position\n",
    "numbered_paths.sort(key=lambda pair: pair[1])\n",
    "\n",
    "validation = pd.DataFrame([path for path, _ in numbered_paths], columns=['path'])\n",
    "validation['wordnet_id'] = content"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Show some validation images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# display a randomly chosen validation image together with its class name\n",
    "i = np.random.randint(0, len(validation))\n",
    "example = validation.loc[i]\n",
    "print(wordnet_decoder[example.wordnet_id])\n",
    "Image.open(example.path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write both tables next to the notebook, without the index column\n",
    "for frame, csv_name in [(training, 'training_metadata.csv'), (validation, 'validation_metadata.csv')]:\n",
    "    frame.to_csv(csv_name, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# gather the boxes of every image under its name\n",
    "grouped = {}\n",
    "for row in annotations.itertuples():\n",
    "    grouped.setdefault(row.just_name, []).append((row.xmin, row.ymin, row.xmax, row.ymax))\n",
    "\n",
    "# one float32 array with a row per box for every image name\n",
    "boxes = {\n",
    "    name: np.array(coordinates, dtype='float32')\n",
    "    for name, coordinates in grouped.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `boxes` is a dict (image name -> float32 array of boxes), not an array.\n",
    "# NOTE(review): np.save has to pickle such an object, so reading it back presumably needs\n",
    "# np.load('boxes.npy', allow_pickle=True).item() - confirm against the numpy version in use.\n",
    "np.save('boxes.npy', boxes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# wordnet id -> human readable name, one entry per line\n",
    "with open('wordnet_decoder.json', 'w') as decoder_file:\n",
    "    json.dump(wordnet_decoder, decoder_file, indent=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# this will be used when training:\n",
    "# wordnet id -> integer label shifted down by one (labels then go from 0 to 999)\n",
    "encoding = {\n",
    "    wordnet_id: integer_label - 1\n",
    "    for integer_label, wordnet_id in id_decoder.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# wordnet id -> zero-based integer label, one entry per line\n",
    "with open('integer_encoding.json', 'w') as encoding_file:\n",
    "    json.dump(encoding, encoding_file, indent=0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
